In [4]:
    
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
    
Figure 2.1
In [5]:
    
# Read the advertising dataset from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='data/Advertising.csv')
data.head(n=5)
    
    Out[5]:
In [6]:
    
# Sales regressed on each advertising channel, one panel per channel.
# Each entry pairs a predictor column with the upper x-limit for its panel.
# Explicit axes replace three copy-pasted subplot/regplot/xlim stanzas, and
# ending on the loop avoids echoing the tuple returned by the last xlim call.
channel_limits = [('TV', 310), ('Radio', 55), ('Newspaper', 110)]
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for ax, (channel, x_max) in zip(axes, channel_limits):
    sns.regplot(x=channel, y='Sales', data=data,
                scatter_kws={'color': 'red'}, ax=ax)
    # Small negative lower bound keeps points near zero off the axis edge.
    ax.set_xlim(-5, x_max)
    
    Out[6]:
    
In [7]:
    
# Read the college dataset from disk and preview its first five rows.
college = pd.read_csv(filepath_or_buffer='data/College.csv')
college.head(n=5)
    
    Out[7]:
In [8]:
    
# Give the unnamed leading CSV column a meaningful label, then show two rows
# to confirm the rename took effect.
column_renames = {'Unnamed: 0': 'Name'}
college.rename(columns=column_renames, inplace=True)
college.head(n=2)
    
    Out[8]:
In [9]:
    
# Summary statistics for the college frame.
college.describe()
    
    Out[9]:
In [10]:
    
# Scatter-plot matrix over the columns at positions 2 through 10.
college_pairplot_cols = college.iloc[:, 2:11]
sns.pairplot(data=college_pairplot_cols)
    
    Out[10]:
    
In [11]:
    
# Distribution of 'Outstate' split by the 'Private' column.
sns.boxplot(data=college, x='Private', y='Outstate')
    
    Out[11]:
    
In [12]:
    
# Label each college 'Yes'/'No' in a new 'Elite' column depending on whether
# its 'Top10perc' value exceeds 50, report the class sizes, and compare
# 'Outstate' across the two groups.
top10 = college['Top10perc']
college.loc[top10 > 50, 'Elite'] = 'Yes'
college.loc[top10 <= 50, 'Elite'] = 'No'
elite_counts = college['Elite'].value_counts()
print(elite_counts)
sns.boxplot(data=college, x='Elite', y='Outstate')
    
    
    Out[12]:
    
In [13]:
    
# Count histograms of 'Apps' and 'Outstate', each at a coarse (20) and a
# fine (100) bin count, laid out on a 2x2 grid.
# sns.histplot replaces sns.distplot(kde=False): distplot is deprecated and
# scheduled for removal. Requires seaborn >= 0.11.
histogram_panels = [
    ('Apps', 20),
    ('Apps', 100),
    ('Outstate', 20),
    ('Outstate', 100),
]
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for ax, (column, n_bins) in zip(axes.flat, histogram_panels):
    sns.histplot(data=college, x=column, bins=n_bins, ax=ax)
    
    Out[13]:
    
In [14]:
    
# Read the auto dataset from disk and preview its first five rows.
# NOTE(review): some copies of this dataset encode missing values as '?';
# confirm the column dtypes before relying on numeric summaries.
auto = pd.read_csv(filepath_or_buffer='data/Auto.csv')
auto.head(n=5)
    
    Out[14]:
In [15]:
    
# Summary statistics for the auto frame.
auto.describe()
    
    Out[15]:
In [17]:
    
# Summarise the data with a middle block of rows removed: keep the first ten
# rows (positions 0-9) and everything from position 85 onward.
# NOTE(review): positions 10-84 are dropped — confirm these bounds match the
# intended observation range.
ss1 = auto.iloc[:10]
ss2 = auto.iloc[85:]
subset = pd.concat(objs=[ss1, ss2])
subset.describe()
    
    Out[17]:
In [18]:
    
# Scatter-plot matrix over the first eight columns of the auto frame.
auto_pairplot_cols = auto.iloc[:, :8]
sns.pairplot(data=auto_pairplot_cols)
    
    Out[18]:
    
In [22]:
    
# Two side-by-side views of 'mpg': grouped by 'cylinders' (left) and
# regressed on 'weight' (right).
fig, (ax_box, ax_reg) = plt.subplots(1, 2, figsize=(12, 6))
sns.boxplot(data=auto, x='cylinders', y='mpg', ax=ax_box)
sns.regplot(data=auto, x='weight', y='mpg',
            scatter_kws={'color': 'red'}, ax=ax_reg)
    
    Out[22]:
    
In [24]:
    
# Read the Boston dataset from disk and show its summary statistics.
boston = pd.read_csv(filepath_or_buffer='data/Boston.csv')
boston.describe()
    
    Out[24]:
In [25]:
    
# Scatter-plot matrix of the Boston variables over ALL rows.
# Note that `boston[1:]` is a row slice: it discards the first observation
# rather than a column. To leave a leading CSV index column out of the plot,
# drop it by name instead; errors='ignore' makes this a no-op when the column
# is absent.
# NOTE(review): the index column is presumed to be named 'Unnamed: 0', as in
# College.csv — confirm against the file.
boston_features = boston.drop(columns='Unnamed: 0', errors='ignore')
sns.pairplot(data=boston_features)
    
    Out[25]:
    
In [27]:
    
# 'chas' is 1 for rows next to the Charles and 0 otherwise, so the column
# total is the number of rows next to the Charles.
boston['chas'].sum()
    
    Out[27]:
In [28]:
    
# Median of the 'ptratio' column.
boston['ptratio'].median()
    
    Out[28]:
In [30]:
    
# Every row whose 'medv' equals the column minimum (ties included).
lowest_medv = boston['medv'].min()
boston.loc[boston['medv'] == lowest_medv]
    
    Out[30]:
In [31]:
    
# Summary statistics restricted to rows with 'rm' of at least 7.
rm_at_least_7 = boston['rm'] >= 7
boston.loc[rm_at_least_7].describe()
    
    Out[31]:
In [32]:
    
# Summary statistics restricted to rows with 'rm' of at least 8.
rm_at_least_8 = boston['rm'] >= 8
boston.loc[rm_at_least_8].describe()
    
    Out[32]:
In [ ]: